林嶔 (Lin, Chin)
Lesson 3
– 這份資料是描述每個人疾病狀況的檔案,我們希望將這份直式資料轉為橫式資料
dat = read.csv("data3_1.csv", header = TRUE, fileEncoding = 'CP950')
head(dat, 10)
## ID Disease
## 1 n CKD
## 2 f DM
## 3 m HTN
## 4 w DM
## 5 u DM
## 6 j HTN
## 7 d CKD
## 8 l CKD
## 9 y HTN
## 10 w HTN
## ID CKD Depression DM HTN
## 1 a TRUE TRUE TRUE TRUE
## 2 b TRUE TRUE TRUE FALSE
## 3 c FALSE TRUE TRUE FALSE
## 4 d TRUE FALSE FALSE TRUE
## 5 e TRUE TRUE FALSE FALSE
## 6 f TRUE FALSE TRUE TRUE
– 請各位先回想在前面兩節課學過哪些函數、功能?我保證只使用已經學會的功能就足以應付這個問題了。
函數「length()」
函數「levels()」
各式索引函數
迴圈函數「for」
…
在最開始的時候,我們要先對這份資料表做初步分析。這份資料表有幾個人?有幾種可能的疾病?
我們很明顯是需要用到函數「levels()」以及函數「length()」的組合
– 而函數「levels()」僅能用在因子向量上,所以我們要先確認ID及Disease的屬性
class(dat[,1])
## [1] "factor"
class(dat[,2])
## [1] "factor"
– 很幸運的,在檔案讀進來的時候兩者就都已經是因子向量了,如果不是的話,記得要使用函數「as.factor()」進行轉換
dat[,1] = as.factor(dat[,1])
dat[,2] = as.factor(dat[,2])
levels.sample = levels(dat[,1])
levels.sample
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "p" "r" "s"
## [18] "t" "u" "v" "w" "x" "y" "z"
n.sample = length(levels.sample)
n.sample
## [1] 24
levels.disease = levels(dat[,2])
levels.disease
## [1] "CKD" "Depression" "DM" "HTN"
n.disease = length(levels.disease)
n.disease
## [1] 4
– 這裡我們會用到函數「matrix」做一個空的矩陣,在最開始的時候我們可以先在矩陣內都填上0(填什麼並不重要,因為等等都會覆蓋掉)
new.dat = matrix(0, nrow = n.sample, ncol = n.disease+1)
colnames(new.dat) = c("ID", levels.disease)
new.dat[,1] = levels.sample
new.dat
## ID CKD Depression DM HTN
## [1,] "a" "0" "0" "0" "0"
## [2,] "b" "0" "0" "0" "0"
## [3,] "c" "0" "0" "0" "0"
## [4,] "d" "0" "0" "0" "0"
## [5,] "e" "0" "0" "0" "0"
## [6,] "f" "0" "0" "0" "0"
## [7,] "g" "0" "0" "0" "0"
## [8,] "h" "0" "0" "0" "0"
## [9,] "i" "0" "0" "0" "0"
## [10,] "j" "0" "0" "0" "0"
## [11,] "k" "0" "0" "0" "0"
## [12,] "l" "0" "0" "0" "0"
## [13,] "m" "0" "0" "0" "0"
## [14,] "n" "0" "0" "0" "0"
## [15,] "p" "0" "0" "0" "0"
## [16,] "r" "0" "0" "0" "0"
## [17,] "s" "0" "0" "0" "0"
## [18,] "t" "0" "0" "0" "0"
## [19,] "u" "0" "0" "0" "0"
## [20,] "v" "0" "0" "0" "0"
## [21,] "w" "0" "0" "0" "0"
## [22,] "x" "0" "0" "0" "0"
## [23,] "y" "0" "0" "0" "0"
## [24,] "z" "0" "0" "0" "0"
i = 1
dat[dat[,1]==levels.sample[i],]
## ID Disease
## 11 a DM
## 18 a CKD
## 45 a Depression
## 49 a HTN
dat[dat[,1]==levels.sample[i],2]
## [1] DM CKD Depression HTN
## Levels: CKD Depression DM HTN
levels.disease %in% dat[dat[,1]==levels.sample[i],2]
## [1] TRUE TRUE TRUE TRUE
new.dat[i,-1] = levels.disease %in% dat[dat[,1]==levels.sample[i],2]
new.dat
## ID CKD Depression DM HTN
## [1,] "a" "TRUE" "TRUE" "TRUE" "TRUE"
## [2,] "b" "0" "0" "0" "0"
## [3,] "c" "0" "0" "0" "0"
## [4,] "d" "0" "0" "0" "0"
## [5,] "e" "0" "0" "0" "0"
## [6,] "f" "0" "0" "0" "0"
## [7,] "g" "0" "0" "0" "0"
## [8,] "h" "0" "0" "0" "0"
## [9,] "i" "0" "0" "0" "0"
## [10,] "j" "0" "0" "0" "0"
## [11,] "k" "0" "0" "0" "0"
## [12,] "l" "0" "0" "0" "0"
## [13,] "m" "0" "0" "0" "0"
## [14,] "n" "0" "0" "0" "0"
## [15,] "p" "0" "0" "0" "0"
## [16,] "r" "0" "0" "0" "0"
## [17,] "s" "0" "0" "0" "0"
## [18,] "t" "0" "0" "0" "0"
## [19,] "u" "0" "0" "0" "0"
## [20,] "v" "0" "0" "0" "0"
## [21,] "w" "0" "0" "0" "0"
## [22,] "x" "0" "0" "0" "0"
## [23,] "y" "0" "0" "0" "0"
## [24,] "z" "0" "0" "0" "0"
## ID CKD Depression DM HTN
## 1 a TRUE TRUE TRUE TRUE
## 2 b TRUE TRUE TRUE FALSE
## 3 c FALSE TRUE TRUE FALSE
## 4 d TRUE FALSE FALSE TRUE
## 5 e TRUE TRUE FALSE FALSE
## 6 f TRUE FALSE TRUE TRUE
## 7 g TRUE TRUE TRUE FALSE
## 8 h TRUE TRUE FALSE TRUE
## 9 i FALSE TRUE TRUE TRUE
## 10 j FALSE FALSE FALSE TRUE
## 11 k TRUE FALSE FALSE FALSE
## 12 l TRUE FALSE FALSE FALSE
## 13 m TRUE TRUE FALSE TRUE
## 14 n TRUE TRUE FALSE TRUE
## 15 p TRUE TRUE FALSE TRUE
## 16 r FALSE FALSE TRUE TRUE
## 17 s TRUE FALSE TRUE FALSE
## 18 t TRUE TRUE FALSE FALSE
## 19 u TRUE TRUE TRUE FALSE
## 20 v FALSE FALSE TRUE TRUE
## 21 w FALSE FALSE TRUE TRUE
## 22 x TRUE FALSE FALSE TRUE
## 23 y TRUE FALSE TRUE TRUE
## 24 z TRUE TRUE FALSE FALSE
– 這份資料相較於上一份有些微的不同,但處理起來邏輯是類似的
– 請各位試著將這份資料轉換可分析的樣式
## 問卷編號 Disease名稱1 Disease名稱2 Disease名稱3
## 1 1
## 2 2 高血壓 C肝
## 3 3 C肝 腎結石
## 4 4 高血壓 氣喘 痛風
## 5 5 高血壓
## 6 6 腎功能不全
## 7 7 高血壓 心臟病
## 8 8 高血壓 心臟病 B肝
## 9 9 心臟病
## 10 10 高血壓 糖尿病
dat = read.csv("data3_1.csv", header = TRUE, fileEncoding = 'CP950')
levels.disease = levels(dat[,2])
levels.sample = levels(dat[,1])
new.dat = matrix(0, nrow = length(levels.sample), ncol = length(levels.disease)+1)
colnames(new.dat) = c("ID", levels.disease)
new.dat[,1] = levels.sample
for (i in 1:length(levels.sample)) {
new.dat[i,-1] = levels.disease%in%dat[dat[,1]==levels.sample[i],2]
}
new.dat = data.frame(new.dat)
new.dat
– 如果直接合併會出問題,必須先轉換成文字格式才能合併
dat = read.csv("data3_2.csv", header = TRUE, fileEncoding = 'CP950')
dat[,2] = as.character(dat[,2])
dat[,3] = as.character(dat[,3])
dat[,4] = as.character(dat[,4])
all_disease = c(dat[,2], dat[,3], dat[,4])
lvl.disease = levels(factor(all_disease))
– 或者是,在讀檔時直接把文字讀成非因子格式
dat = read.csv("data3_2.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE)
all_disease = c(dat[,2], dat[,3], dat[,4])
lvl.disease = levels(factor(all_disease))
lvl.disease = lvl.disease[-1]
lvl.sample = levels(factor(dat[,1]))
new.dat = matrix(0, nrow = length(lvl.sample), ncol = length(lvl.disease)+1)
colnames(new.dat) = c("ID", lvl.disease)
new.dat[,1] = lvl.sample
for (i in 1:length(lvl.disease)) {
new.dat[i,-1] = lvl.disease %in% dat[dat[,1]==lvl.sample[i],2:4]
}
new.dat = data.frame(new.dat)
new.dat
– 當然,之後也許你會看到一些函數能加速整個流程,但在面對小筆資料時熟練的使用迴圈能幫忙我們迅速做完資料轉換
– 這份資料是從三軍總醫院生化檢驗值系統截取某10位病患在這段期間內所測得之各式生化值
dat = read.csv("data3_3.csv", header = TRUE, fileEncoding = 'CP950')
head(dat, 10)
## PATNUMBER COLLECTIONDATE TESTNAME RESVALUE
## 1 2185 2011/12/12 上午 8:09:00 Creatinine 7.0
## 2 2185 2011/12/12 上午 8:09:00 Total Calcium 7.1
## 3 2185 2011/12/12 上午 8:09:00 Na 137.0
## 4 2185 2011/12/12 上午 8:09:00 IP 7.9
## 5 691 2011/12/12 下午 6:32:00 Creatinine 3.1
## 6 2185 2011/12/29 上午 6:19:00 Creatinine 7.2
## 7 2185 2011/12/29 上午 6:19:00 Na 136.0
## 8 691 2011/12/19 上午 4:38:00 Creatinine 8.0
## 9 691 2011/12/19 上午 4:38:00 Na 137.0
## 10 2185 2011/12/19 上午 8:47:00 Creatinine 8.1
## PATNUMBER COLLECTIONDATE Albumin Albumin body fluid AST
## [1,] "175" "2011/10/1 上午 8:24:00" NA NA NA
## [2,] "175" "2011/10/5 下午 4:46:00" NA NA NA
## [3,] "175" "2011/10/6 上午 9:01:00" NA NA NA
## [4,] "175" "2011/10/8 上午 6:42:00" NA NA NA
## [5,] "175" "2011/11/10 上午 9:01:00" NA NA NA
## [6,] "175" "2011/11/10 下午 1:25:00" NA NA NA
## BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA NA NA "3.7" NA NA
## [2,] NA NA NA "3.2" NA NA
## [3,] NA NA NA NA NA NA
## [4,] NA NA NA "3.4" NA NA
## [5,] NA NA NA NA NA NA
## [6,] NA NA NA NA NA NA
## HDL-Cholesterol IP K LDL-Cholesterol Na Total Calcium
## [1,] NA "4.3" NA NA "138" "7.3"
## [2,] NA NA NA NA "139" NA
## [3,] NA "4.5" NA NA NA "7.8"
## [4,] NA NA NA NA NA NA
## [5,] NA "4.5" NA NA NA "7.3"
## [6,] NA NA NA NA NA NA
## Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] "342" "335" NA NA
## [4,] NA NA NA NA
## [5,] "342" "326" NA NA
## [6,] NA NA NA NA
## urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] "0.6" "28.3" "39.1" "48"
## urine Uric Acid
## [1,] NA
## [2,] NA
## [3,] NA
## [4,] NA
## [5,] NA
## [6,] NA
– 一樣,先取得這份資料的基本資訊
levels.TESTNAME = levels(dat[,3])
levels.TESTNAME
## [1] "Albumin" "Albumin body fluid" "AST"
## [4] "BUN" "BUN Fluid" "Cholesterol Fluid"
## [7] "Creatinine" "Creatinine Fluid" "GLU(AC)"
## [10] "HDL-Cholesterol" "IP" "K"
## [13] "LDL-Cholesterol" "Na" "Total Calcium"
## [16] "Total Cholesterol" "Triglyceride" "Triglycerol Fluid"
## [19] "Uric Acid" "urine Calcium" "urine Phosphorus"
## [22] "urine Potassium" "urine Sodium" "urine Uric Acid"
n.TESTNAME = length(levels.TESTNAME)
n.TESTNAME
## [1] 24
levels.PATNUMBER = levels(as.factor(dat[,1]))
levels.PATNUMBER
## [1] "175" "356" "691" "1332" "1350" "1654" "1826" "2074" "2154" "2185"
n.PATNUMBER = length(levels.PATNUMBER)
n.PATNUMBER
## [1] 10
– 在寫迴圈時,我習慣先在起頭令迴圈變數為1,如果這段以後能執行,那應該整個迴圈都不會有問題
i = 1
subdat = dat[dat[,1]==levels.PATNUMBER[i],]
levels.COLLECTIONDATE = levels(subdat[,2])
n.COLLECTIONDATE = length(levels.COLLECTIONDATE)
n.COLLECTIONDATE
## [1] 1532
subdat[,2] = as.factor(as.character(subdat[,2]))
levels.COLLECTIONDATE = levels(subdat[,2])
n.COLLECTIONDATE = length(levels.COLLECTIONDATE)
n.COLLECTIONDATE
## [1] 132
– 第一欄填ID,第二欄填上這個人所有測量的時間點
submatrix = matrix(NA, nrow = n.COLLECTIONDATE, ncol = n.TESTNAME+2)
colnames(submatrix) = c("PATNUMBER", "COLLECTIONDATE", levels.TESTNAME)
submatrix[,1] = levels.PATNUMBER[i]
submatrix[,2] = levels.COLLECTIONDATE
head(submatrix)
## PATNUMBER COLLECTIONDATE Albumin Albumin body fluid AST
## [1,] "175" "2011/10/1 上午 8:24:00" NA NA NA
## [2,] "175" "2011/10/5 下午 4:46:00" NA NA NA
## [3,] "175" "2011/10/6 上午 9:01:00" NA NA NA
## [4,] "175" "2011/10/8 上午 6:42:00" NA NA NA
## [5,] "175" "2011/11/10 上午 9:01:00" NA NA NA
## [6,] "175" "2011/11/10 下午 1:25:00" NA NA NA
## BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA NA NA NA NA NA
## [2,] NA NA NA NA NA NA
## [3,] NA NA NA NA NA NA
## [4,] NA NA NA NA NA NA
## [5,] NA NA NA NA NA NA
## [6,] NA NA NA NA NA NA
## HDL-Cholesterol IP K LDL-Cholesterol Na Total Calcium
## [1,] NA NA NA NA NA NA
## [2,] NA NA NA NA NA NA
## [3,] NA NA NA NA NA NA
## [4,] NA NA NA NA NA NA
## [5,] NA NA NA NA NA NA
## [6,] NA NA NA NA NA NA
## Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] NA NA NA NA
## urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] NA NA NA NA
## urine Uric Acid
## [1,] NA
## [2,] NA
## [3,] NA
## [4,] NA
## [5,] NA
## [6,] NA
– 同樣的,我們先看第一個時間點,注意迴圈變數不要重複,所以剛剛設i,這次我們設j
j = 1
subsubdat = subdat[subdat[,2]==levels.COLLECTIONDATE[j],]
subsubdat
## PATNUMBER COLLECTIONDATE TESTNAME RESVALUE
## 3993 175 2011/10/1 上午 8:24:00 IP 4.3
## 3994 175 2011/10/1 上午 8:24:00 Total Calcium 7.3
## 3995 175 2011/10/1 上午 8:24:00 Creatinine 3.7
## 3996 175 2011/10/1 上午 8:24:00 Na 138.0
– 函數「which()」可以幫我們找位置
k = 1
NAME = subsubdat[k,3]
NAME
## [1] IP
## 24 Levels: Albumin Albumin body fluid AST BUN ... urine Uric Acid
position = which(NAME == levels.TESTNAME) + 2
position
## [1] 13
submatrix[j, position] = subsubdat[k,4]
head(submatrix)
## PATNUMBER COLLECTIONDATE Albumin Albumin body fluid AST
## [1,] "175" "2011/10/1 上午 8:24:00" NA NA NA
## [2,] "175" "2011/10/5 下午 4:46:00" NA NA NA
## [3,] "175" "2011/10/6 上午 9:01:00" NA NA NA
## [4,] "175" "2011/10/8 上午 6:42:00" NA NA NA
## [5,] "175" "2011/11/10 上午 9:01:00" NA NA NA
## [6,] "175" "2011/11/10 下午 1:25:00" NA NA NA
## BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA NA NA NA NA NA
## [2,] NA NA NA NA NA NA
## [3,] NA NA NA NA NA NA
## [4,] NA NA NA NA NA NA
## [5,] NA NA NA NA NA NA
## [6,] NA NA NA NA NA NA
## HDL-Cholesterol IP K LDL-Cholesterol Na Total Calcium
## [1,] NA "4.3" NA NA NA NA
## [2,] NA NA NA NA NA NA
## [3,] NA NA NA NA NA NA
## [4,] NA NA NA NA NA NA
## [5,] NA NA NA NA NA NA
## [6,] NA NA NA NA NA NA
## Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] NA NA NA NA
## urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] NA NA NA NA
## urine Uric Acid
## [1,] NA
## [2,] NA
## [3,] NA
## [4,] NA
## [5,] NA
## [6,] NA
subsubdat #先看看subsubdat裡面有哪些
## PATNUMBER COLLECTIONDATE TESTNAME RESVALUE
## 3993 175 2011/10/1 上午 8:24:00 IP 4.3
## 3994 175 2011/10/1 上午 8:24:00 Total Calcium 7.3
## 3995 175 2011/10/1 上午 8:24:00 Creatinine 3.7
## 3996 175 2011/10/1 上午 8:24:00 Na 138.0
for (k in 1:nrow(subsubdat)) {
NAME = subsubdat[k,3]
position = which(NAME == levels.TESTNAME) + 2
submatrix[j, position] = subsubdat[k,4]
}
head(submatrix)
## PATNUMBER COLLECTIONDATE Albumin Albumin body fluid AST
## [1,] "175" "2011/10/1 上午 8:24:00" NA NA NA
## [2,] "175" "2011/10/5 下午 4:46:00" NA NA NA
## [3,] "175" "2011/10/6 上午 9:01:00" NA NA NA
## [4,] "175" "2011/10/8 上午 6:42:00" NA NA NA
## [5,] "175" "2011/11/10 上午 9:01:00" NA NA NA
## [6,] "175" "2011/11/10 下午 1:25:00" NA NA NA
## BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA NA NA "3.7" NA NA
## [2,] NA NA NA NA NA NA
## [3,] NA NA NA NA NA NA
## [4,] NA NA NA NA NA NA
## [5,] NA NA NA NA NA NA
## [6,] NA NA NA NA NA NA
## HDL-Cholesterol IP K LDL-Cholesterol Na Total Calcium
## [1,] NA "4.3" NA NA "138" "7.3"
## [2,] NA NA NA NA NA NA
## [3,] NA NA NA NA NA NA
## [4,] NA NA NA NA NA NA
## [5,] NA NA NA NA NA NA
## [6,] NA NA NA NA NA NA
## Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] NA NA NA NA
## urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] NA NA NA NA
## urine Uric Acid
## [1,] NA
## [2,] NA
## [3,] NA
## [4,] NA
## [5,] NA
## [6,] NA
for (j in 1:n.COLLECTIONDATE) {
subsubdat = subdat[subdat[,2]==levels.COLLECTIONDATE[j],]
for (k in 1:nrow(subsubdat)) {
NAME = subsubdat[k,3]
position = which(NAME == levels.TESTNAME) + 2
submatrix[j, position] = subsubdat[k,4]
}
}
head(submatrix)
## PATNUMBER COLLECTIONDATE Albumin Albumin body fluid AST
## [1,] "175" "2011/10/1 上午 8:24:00" NA NA NA
## [2,] "175" "2011/10/5 下午 4:46:00" NA NA NA
## [3,] "175" "2011/10/6 上午 9:01:00" NA NA NA
## [4,] "175" "2011/10/8 上午 6:42:00" NA NA NA
## [5,] "175" "2011/11/10 上午 9:01:00" NA NA NA
## [6,] "175" "2011/11/10 下午 1:25:00" NA NA NA
## BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA NA NA "3.7" NA NA
## [2,] NA NA NA "3.2" NA NA
## [3,] NA NA NA NA NA NA
## [4,] NA NA NA "3.4" NA NA
## [5,] NA NA NA NA NA NA
## [6,] NA NA NA NA NA NA
## HDL-Cholesterol IP K LDL-Cholesterol Na Total Calcium
## [1,] NA "4.3" NA NA "138" "7.3"
## [2,] NA NA NA NA "139" NA
## [3,] NA "4.5" NA NA NA "7.8"
## [4,] NA NA NA NA NA NA
## [5,] NA "4.5" NA NA NA "7.3"
## [6,] NA NA NA NA NA NA
## Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] "342" "335" NA NA
## [4,] NA NA NA NA
## [5,] "342" "326" NA NA
## [6,] NA NA NA NA
## urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] "0.6" "28.3" "39.1" "48"
## urine Uric Acid
## [1,] NA
## [2,] NA
## [3,] NA
## [4,] NA
## [5,] NA
## [6,] NA
– 下列這串程式碼可以獲得一個完整的submatrix
i = 1
subdat = dat[dat[,1]==levels.PATNUMBER[i],]
subdat[,2] = as.factor(as.character(subdat[,2]))
levels.COLLECTIONDATE = levels(subdat[,2])
n.COLLECTIONDATE = length(levels.COLLECTIONDATE)
n.COLLECTIONDATE
submatrix = matrix(NA, nrow = n.COLLECTIONDATE, ncol = n.TESTNAME+2)
colnames(submatrix) = c("PATNUMBER", "COLLECTIONDATE", levels.TESTNAME)
submatrix[,1] = levels.PATNUMBER[i]
submatrix[,2] = levels.COLLECTIONDATE
for (j in 1:n.COLLECTIONDATE) {
subsubdat = subdat[subdat[,2]==levels.COLLECTIONDATE[j],]
for (k in 1:nrow(subsubdat)) {
NAME = subsubdat[k,3]
position = which(NAME == levels.TESTNAME) + 2
submatrix[j, position] = subsubdat[k,4]
}
}
final.data = NULL
final.data
## NULL
final.data = rbind(final.data, submatrix)
head(final.data)
## PATNUMBER COLLECTIONDATE Albumin Albumin body fluid AST
## [1,] "175" "2011/10/1 上午 8:24:00" NA NA NA
## [2,] "175" "2011/10/5 下午 4:46:00" NA NA NA
## [3,] "175" "2011/10/6 上午 9:01:00" NA NA NA
## [4,] "175" "2011/10/8 上午 6:42:00" NA NA NA
## [5,] "175" "2011/11/10 上午 9:01:00" NA NA NA
## [6,] "175" "2011/11/10 下午 1:25:00" NA NA NA
## BUN BUN Fluid Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC)
## [1,] NA NA NA "3.7" NA NA
## [2,] NA NA NA "3.2" NA NA
## [3,] NA NA NA NA NA NA
## [4,] NA NA NA "3.4" NA NA
## [5,] NA NA NA NA NA NA
## [6,] NA NA NA NA NA NA
## HDL-Cholesterol IP K LDL-Cholesterol Na Total Calcium
## [1,] NA "4.3" NA NA "138" "7.3"
## [2,] NA NA NA NA "139" NA
## [3,] NA "4.5" NA NA NA "7.8"
## [4,] NA NA NA NA NA NA
## [5,] NA "4.5" NA NA NA "7.3"
## [6,] NA NA NA NA NA NA
## Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] "342" "335" NA NA
## [4,] NA NA NA NA
## [5,] "342" "326" NA NA
## [6,] NA NA NA NA
## urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] "0.6" "28.3" "39.1" "48"
## urine Uric Acid
## [1,] NA
## [2,] NA
## [3,] NA
## [4,] NA
## [5,] NA
## [6,] NA
levels.TESTNAME = levels(dat[,3])
n.TESTNAME = length(levels.TESTNAME)
levels.PATNUMBER = levels(as.factor(dat[,1]))
n.PATNUMBER = length(levels.PATNUMBER)
final.data = NULL
for (i in 1:n.PATNUMBER) {
subdat = dat[dat[,1]==levels.PATNUMBER[i],]
subdat[,2] = as.factor(as.character(subdat[,2]))
levels.COLLECTIONDATE = levels(subdat[,2])
n.COLLECTIONDATE = length(levels.COLLECTIONDATE)
submatrix = matrix(NA, nrow = n.COLLECTIONDATE, ncol = n.TESTNAME+2)
colnames(submatrix) = c("PATNUMBER", "COLLECTIONDATE", levels.TESTNAME)
submatrix[,1] = levels.PATNUMBER[i]
submatrix[,2] = levels.COLLECTIONDATE
for (j in 1:n.COLLECTIONDATE) {
subsubdat = subdat[subdat[,2]==levels.COLLECTIONDATE[j],]
for (k in 1:nrow(subsubdat)) {
NAME = subsubdat[k,3]
position = which(NAME == levels.TESTNAME) + 2
submatrix[j, position] = subsubdat[k,4]
}
}
final.data = rbind(final.data, submatrix)
}
head(final.data)
– 這次,除了檔案更大以外,檔案的最後還有參考值。如果你的值位於參考值內,那就是正常,否則則是過高。
– 我們這次不要填數值,而是填入正常(TRUE)或異常(FALSE)!
dat = read.csv("data3_4.csv", header = TRUE, fileEncoding = 'CP950')
head(dat, 10)
## PATNUMBER SEX COLLECTIONDATE TESTNAME RESVALUE UNITS
## 1 180 1 2011/12/11 上午 5:10:00 Na 131.0 mmol/L
## 2 589 1 2011/12/11 上午 6:37:00 Creatinine 3.8 mg/dL
## 3 589 1 2011/12/11 上午 6:37:00 Na 138.0 mmol/L
## 4 1015 1 2011/12/12 上午 7:38:00 Total Cholesterol 158.0 mg/dL
## 5 1015 1 2011/12/12 上午 7:38:00 Creatinine 1.5 mg/dL
## 6 1015 1 2011/12/12 上午 7:38:00 Triglyceride 140.0 mg/dL
## 7 1015 1 2011/12/12 上午 7:38:00 Na 143.0 mmol/L
## 8 480 2 2011/12/12 上午 7:41:00 Triglyceride 153.0 mg/dL
## 9 480 2 2011/12/12 上午 7:41:00 Na 139.0 mmol/L
## 10 480 2 2011/12/12 上午 7:41:00 Total Cholesterol 211.0 mg/dL
## MINIMUM MAXIMUM
## 1 136.0 145.0
## 2 0.7 1.2
## 3 136.0 145.0
## 4 NA 200.0
## 5 0.7 1.2
## 6 NA 200.0
## 7 136.0 145.0
## 8 NA 200.0
## 9 136.0 145.0
## 10 NA 200.0
– 函數「Sys.sleep()」是讓系統休息,你不需要將他加入你的迴圈內
n = 100
pb = txtProgressBar(max = n, style=3)
for(i in 1:n) {
Sys.sleep(0.1)
setTxtProgressBar(pb, i)
}
close(pb)
levels.TESTNAME = levels(dat[,'TESTNAME'])
n.TESTNAME = length(levels.TESTNAME)
levels.PATNUMBER = levels(as.factor(dat[,'PATNUMBER']))
n.PATNUMBER = length(levels.PATNUMBER)
final.data = NULL
pb = txtProgressBar(max = n.PATNUMBER, style=3)
for (i in 1:n.PATNUMBER) {
subdat = dat[dat[,'PATNUMBER']==levels.PATNUMBER[i],]
subdat[,'COLLECTIONDATE'] = as.factor(as.character(subdat[,'COLLECTIONDATE']))
levels.COLLECTIONDATE = levels(subdat[,'COLLECTIONDATE'])
n.COLLECTIONDATE = length(levels.COLLECTIONDATE)
submatrix = matrix(NA, nrow = n.COLLECTIONDATE, ncol = n.TESTNAME+2)
colnames(submatrix) = c("PATNUMBER", "COLLECTIONDATE", levels.TESTNAME)
submatrix[,1] = levels.PATNUMBER[i]
submatrix[,2] = levels.COLLECTIONDATE
for (j in 1:n.COLLECTIONDATE) {
subsubdat = subdat[subdat[,'COLLECTIONDATE']==levels.COLLECTIONDATE[j],]
for (k in 1:nrow(subsubdat)) {
NAME = subsubdat[k,'TESTNAME']
position = which(NAME == levels.TESTNAME) + 2
VALUE = subsubdat[k,'RESVALUE']
MINIMUM = subsubdat[k,'MINIMUM']
MAXIMUM = subsubdat[k,'MAXIMUM']
if (is.na(MINIMUM)) {MINIMUM = -Inf}
if (is.na(MAXIMUM)) {MAXIMUM = Inf}
submatrix[j, position] = (VALUE >= MINIMUM & VALUE <= MAXIMUM)
}
}
final.data = rbind(final.data, submatrix)
setTxtProgressBar(pb, i)
}
close(pb)
head(final.data)